home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
AmigActive 2
/
AACD 2.iso
/
AACD
/
Magazine
/
GraphicsCards
/
StormMesa
/
src
/
vbxform_asmppc.p
< prev
next >
Wrap
Text File
|
1999-02-04
|
58KB
|
1,774 lines
;
; Mesa 3-D graphics library
; Version: 2.5
; Copyright (C) 1995-1997 Brian Paul
;
; This library is free software; you can redistribute it and/or
; modify it under the terms of the GNU Library General Public
; License as published by the Free Software Foundation; either
; version 2 of the License, or (at your option) any later version.
;
; This library is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; Library General Public License for more details.
;
; You should have received a copy of the GNU Library General Public
; License along with this library; if not, write to the Free
; Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
;
; vbxform_asmppc.p
; 16.5.1998 by Sam Jordan
;
; PowerPC assembler optimizations of several functions in vbxform.c
; Originally written for AMIGA OS/PowerOpen. To use this source on other
; PowerPC based platforms or with other programming models, some
; modifications might be needed.
; Clip-flag bits. The _asm_project_and_cliptest_* routines OR these into
; the per-vertex clip mask byte and into the accumulated or/and masks.
CLIP_RIGHT_BIT  = $01                   ; set when cx >  cw
CLIP_LEFT_BIT   = $02                   ; set when cx < -cw
CLIP_TOP_BIT    = $04                   ; set when cy >  cw
CLIP_BOTTOM_BIT = $08                   ; set when cy < -cw
CLIP_NEAR_BIT   = $10                   ; set when cz < -cw
CLIP_FAR_BIT    = $20                   ; set when cz >  cw

; Exported entry points: eye-space transforms for 3-component vertices ...
        XDEF    _asm_transform_points3_general
        XDEF    _asm_transform_points3_identity
        XDEF    _asm_transform_points3_2d
        XDEF    _asm_transform_points3_2d_no_rot
        XDEF    _asm_transform_points3_3d
; ... the same transforms for 4-component vertices ...
        XDEF    _asm_transform_points4_general
        XDEF    _asm_transform_points4_identity
        XDEF    _asm_transform_points4_2d
        XDEF    _asm_transform_points4_2d_no_rot
        XDEF    _asm_transform_points4_3d
; ... projection combined with clip testing ...
        XDEF    _asm_project_and_cliptest_general
        XDEF    _asm_project_and_cliptest_identity
        XDEF    _asm_project_and_cliptest_ortho
        XDEF    _asm_project_and_cliptest_perspective
; ... and the viewport mapping.
        XDEF    _asm_vp_map_vertices_now
        XDEF    _asm_vp_map_vertices
_asm_transform_points3_general
; Transform n three-component vertices by a full 4x4 matrix.
;
; C reference:
;   const GLfloat *m = ctx->ModelViewMatrix;
;   for (i=0;i<n;i++) {
;     GLfloat ox = vObj[i][0], oy = vObj[i][1], oz = vObj[i][2];
;     vEye[i][0] = m[0]*ox + m[4]*oy + m[8]*oz  + m[12];
;     vEye[i][1] = m[1]*ox + m[5]*oy + m[9]*oz  + m[13];
;     vEye[i][2] = m[2]*ox + m[6]*oy + m[10]*oz + m[14];
;     vEye[i][3] = m[3]*ox + m[7]*oy + m[11]*oz + m[15];
;   }
;
; Registers as used by the code:
;   r3 = n (loaded into CTR)
;   r4 = vEye, written 4 floats per vertex
;   r5 = m, 16 consecutive floats read into f0-f15
;   r6 = vObj, 3 floats read per vertex, the 4th is stepped over
;   f16-f18 = ox/oy/oz, f19-f22 = the four result components
; f14-f22 are stored below r1 on entry and reloaded before returning.
        stfd    f14,-1*8(r1)
        stfd    f15,-2*8(r1)
        stfd    f16,-3*8(r1)
        stfd    f17,-4*8(r1)
        stfd    f18,-5*8(r1)
        stfd    f19,-6*8(r1)
        stfd    f20,-7*8(r1)
        stfd    f21,-8*8(r1)
        stfd    f22,-9*8(r1)
        cmpwi   r3,0                    ; nothing to do for n == 0
        beq     .restore
        mtctr   r3
        subi    r6,r6,4                 ; pre-bias pointers for the
        subi    r4,r4,4                 ; update-form loads/stores
        subi    r5,r5,4
        lfsu    f0,4(r5)                ; f0..f15 = m[0]..m[15]
        lfsu    f1,4(r5)
        lfsu    f2,4(r5)
        lfsu    f3,4(r5)
        lfsu    f4,4(r5)
        lfsu    f5,4(r5)
        lfsu    f6,4(r5)
        lfsu    f7,4(r5)
        lfsu    f8,4(r5)
        lfsu    f9,4(r5)
        lfsu    f10,4(r5)
        lfsu    f11,4(r5)
        lfsu    f12,4(r5)
        lfsu    f13,4(r5)
        lfsu    f14,4(r5)
        lfsu    f15,4(r5)
.next_vertex
        lfsu    f16,4(r6)               ; f16 = ox
        fmadds  f19,f0,f16,f12          ; start each row with m*ox + translation
        fmadds  f20,f1,f16,f13
        lfsu    f17,4(r6)               ; f17 = oy
        fmadds  f21,f2,f16,f14
        fmadds  f22,f3,f16,f15
        fmadds  f19,f4,f17,f19
        fmadds  f20,f5,f17,f20
        lfsu    f18,4(r6)               ; f18 = oz
        fmadds  f21,f6,f17,f21
        fmadds  f22,f7,f17,f22
        addi    r6,r6,4                 ; step over the 4th source float
        fmadds  f19,f8,f18,f19
        stfsu   f19,4(r4)               ; vEye[i][0]
        fmadds  f20,f9,f18,f20
        stfsu   f20,4(r4)               ; vEye[i][1]
        fmadds  f21,f10,f18,f21
        stfsu   f21,4(r4)               ; vEye[i][2]
        fmadds  f22,f11,f18,f22
        stfsu   f22,4(r4)               ; vEye[i][3]
        bdnz    .next_vertex
.restore
        lfd     f14,-1*8(r1)
        lfd     f15,-2*8(r1)
        lfd     f16,-3*8(r1)
        lfd     f17,-4*8(r1)
        lfd     f18,-5*8(r1)
        lfd     f19,-6*8(r1)
        lfd     f20,-7*8(r1)
        lfd     f21,-8*8(r1)
        lfd     f22,-9*8(r1)
        blr
_asm_transform_points3_identity
; Identity transform: copy x, y, z of every vertex and store 1.0 as w.
;
; C reference:
;   for (i=0;i<n;i++) {
;     vEye[i][0] = vObj[i][0];
;     vEye[i][1] = vObj[i][1];
;     vEye[i][2] = vObj[i][2];
;     vEye[i][3] = 1.0F;
;   }
;
; Registers as used by the code:
;   r3 = n (loaded into CTR), r4 = vEye (written), r5 = vObj (read)
;   r0 = word being copied, r6 = $3f800000 (bit pattern of 1.0F)
; The floats are moved as raw 32-bit words through r0.
        cmpwi   r3,0
        beq     .exit
        mtctr   r3
        lis     r6,$3f80                ; r6 = 1.0F as an integer word
        subi    r5,r5,4                 ; pre-bias for lwzu/stwu
        subi    r4,r4,4
.copy_vertex
        lwzu    r0,4(r5)                ; x
        stwu    r0,4(r4)
        lwzu    r0,4(r5)                ; y
        stwu    r0,4(r4)
        lwzu    r0,4(r5)                ; z
        stwu    r0,4(r4)
        addi    r5,r5,4                 ; step over the 4th source float
        stwu    r6,4(r4)                ; w = 1.0F
        bdnz    .copy_vertex
.exit
        blr
_asm_transform_points3_2d
; 2D transform (rotation/scale in x,y plus translation); z is passed
; through and w is set to 1.0.
;
; C reference:
;   const GLfloat *m = ctx->ModelViewMatrix;
;   for (i=0;i<n;i++) {
;     GLfloat ox = vObj[i][0], oy = vObj[i][1], oz = vObj[i][2];
;     vEye[i][0] = m[0]*ox + m[4]*oy + m[12];
;     vEye[i][1] = m[1]*ox + m[5]*oy + m[13];
;     vEye[i][2] = oz;
;     vEye[i][3] = 1.0F;
;   }
;
; Registers as used by the code:
;   r3 = n (loaded into CTR), r4 = vEye (written), r5 = m, r6 = vObj
;   f0,f1,f4,f5,f12,f13 = m[0],m[1],m[4],m[5],m[12],m[13]
;   f6-f8 = ox/oy/oz, f2/f3 = results, r7 = $3f800000 (1.0F)
        cmpwi   r3,0
        beq     .exit
        mtctr   r3
        lis     r7,$3f80                ; r7 = 1.0F as an integer word
        subi    r6,r6,4                 ; pre-bias for lfsu/stfsu
        subi    r4,r4,4
        lfs     f0,0*4(r5)              ; m0
        lfs     f1,1*4(r5)              ; m1
        lfs     f4,4*4(r5)              ; m4
        lfs     f5,5*4(r5)              ; m5
        lfs     f12,12*4(r5)            ; m12
        lfs     f13,13*4(r5)            ; m13
.next_vertex
        lfsu    f6,4(r6)                ; f6 = ox
        fmadds  f2,f0,f6,f12            ; m0*ox + m12
        fmadds  f3,f1,f6,f13            ; m1*ox + m13
        lfsu    f7,4(r6)                ; f7 = oy
        fmadds  f2,f4,f7,f2             ; + m4*oy
        lfsu    f8,4(r6)                ; f8 = oz
        fmadds  f3,f5,f7,f3             ; + m5*oy
        stfsu   f2,4(r4)                ; vEye[i][0]
        stfsu   f3,4(r4)                ; vEye[i][1]
        stfsu   f8,4(r4)                ; vEye[i][2] = oz
        addi    r6,r6,4                 ; step over the 4th source float
        stwu    r7,4(r4)                ; vEye[i][3] = 1.0F
        bdnz    .next_vertex
.exit
        blr
_asm_transform_points3_2d_no_rot
; 2D transform without rotation: independent scale + translation of x
; and y; z is passed through and w is set to 1.0.
;
; C reference:
;   const GLfloat *m = ctx->ModelViewMatrix;
;   for (i=0;i<n;i++) {
;     GLfloat ox = vObj[i][0], oy = vObj[i][1], oz = vObj[i][2];
;     vEye[i][0] = m[0]*ox + m[12];
;     vEye[i][1] = m[5]*oy + m[13];
;     vEye[i][2] = oz;
;     vEye[i][3] = 1.0F;
;   }
;
; Registers as used by the code:
;   r3 = n (loaded into CTR), r4 = vEye (written), r5 = m, r6 = vObj
;   f0,f5,f12,f13 = m[0],m[5],m[12],m[13]
;   f6-f8 = ox/oy/oz, f2/f3 = results, r7 = $3f800000 (1.0F)
        cmpwi   r3,0
        beq     .exit
        mtctr   r3
        lis     r7,$3f80                ; r7 = 1.0F as an integer word
        subi    r6,r6,4                 ; pre-bias for lfsu/stfsu
        subi    r4,r4,4
        lfs     f0,0*4(r5)              ; m0
        lfs     f5,5*4(r5)              ; m5
        lfs     f12,12*4(r5)            ; m12
        lfs     f13,13*4(r5)            ; m13
.next_vertex
        lfsu    f6,4(r6)                ; f6 = ox
        fmadds  f2,f0,f6,f12            ; m0*ox + m12
        lfsu    f7,4(r6)                ; f7 = oy
        fmadds  f3,f5,f7,f13            ; m5*oy + m13
        lfsu    f8,4(r6)                ; f8 = oz
        stfsu   f2,4(r4)                ; vEye[i][0]
        stfsu   f3,4(r4)                ; vEye[i][1]
        stfsu   f8,4(r4)                ; vEye[i][2] = oz
        addi    r6,r6,4                 ; step over the 4th source float
        stwu    r7,4(r4)                ; vEye[i][3] = 1.0F
        bdnz    .next_vertex
.exit
        blr
_asm_transform_points3_3d
; 3D affine transform (upper 3x3 plus translation); w is set to 1.0.
;
; C reference:
;   const GLfloat *m = ctx->ModelViewMatrix;
;   for (i=0;i<n;i++) {
;     GLfloat ox = vObj[i][0], oy = vObj[i][1], oz = vObj[i][2];
;     vEye[i][0] = m[0]*ox + m[4]*oy + m[8]*oz  + m[12];
;     vEye[i][1] = m[1]*ox + m[5]*oy + m[9]*oz  + m[13];
;     vEye[i][2] = m[2]*ox + m[6]*oy + m[10]*oz + m[14];
;     vEye[i][3] = 1.0F;
;   }
;
; Registers as used by the code:
;   r3 = n (loaded into CTR), r4 = vEye (written), r5 = m, r6 = vObj
;   f0-f2, f4-f6, f8-f10, f12-f14 = matrix columns without the 4th row
;   f3/f7/f11 = ox/oy/oz, f15-f17 = results, r7 = $3f800000 (1.0F)
; f14-f17 are stored below r1 on entry and reloaded before returning.
        stfd    f14,-1*8(r1)
        stfd    f15,-2*8(r1)
        stfd    f16,-3*8(r1)
        stfd    f17,-4*8(r1)
        cmpwi   r3,0
        beq     .restore
        mtctr   r3
        lis     r7,$3f80                ; r7 = 1.0F as an integer word
        subi    r6,r6,4                 ; pre-bias for lfsu/stfsu
        subi    r4,r4,4
        subi    r5,r5,4
        lfsu    f0,4(r5)                ; m0
        lfsu    f1,4(r5)                ; m1
        lfsu    f2,4(r5)                ; m2
        addi    r5,r5,4                 ; m3 is not needed
        lfsu    f4,4(r5)                ; m4
        lfsu    f5,4(r5)                ; m5
        lfsu    f6,4(r5)                ; m6
        addi    r5,r5,4                 ; m7 is not needed
        lfsu    f8,4(r5)                ; m8
        lfsu    f9,4(r5)                ; m9
        lfsu    f10,4(r5)               ; m10
        addi    r5,r5,4                 ; m11 is not needed
        lfsu    f12,4(r5)               ; m12
        lfsu    f13,4(r5)               ; m13
        lfsu    f14,4(r5)               ; m14
.next_vertex
        lfsu    f3,4(r6)                ; f3 = ox
        fmadds  f15,f0,f3,f12
        fmadds  f16,f1,f3,f13
        lfsu    f7,4(r6)                ; f7 = oy
        fmadds  f17,f2,f3,f14
        fmadds  f15,f4,f7,f15
        lfsu    f11,4(r6)               ; f11 = oz
        fmadds  f16,f5,f7,f16
        fmadds  f17,f6,f7,f17
        fmadds  f15,f8,f11,f15
        stfsu   f15,4(r4)               ; vEye[i][0]
        fmadds  f16,f9,f11,f16
        addi    r6,r6,4                 ; step over the 4th source float
        stfsu   f16,4(r4)               ; vEye[i][1]
        fmadds  f17,f10,f11,f17
        stfsu   f17,4(r4)               ; vEye[i][2]
        stwu    r7,4(r4)                ; vEye[i][3] = 1.0F
        bdnz    .next_vertex
.restore
        lfd     f14,-1*8(r1)
        lfd     f15,-2*8(r1)
        lfd     f16,-3*8(r1)
        lfd     f17,-4*8(r1)
        blr
_asm_transform_points4_general
; Transform n four-component vertices by a full 4x4 matrix.
;
; C reference:
;   const GLfloat *m = ctx->ModelViewMatrix;
;   for (i=0;i<n;i++) {
;     GLfloat ox = vObj[i][0], oy = vObj[i][1];
;     GLfloat oz = vObj[i][2], ow = vObj[i][3];
;     vEye[i][0] = m[0]*ox + m[4]*oy + m[8]*oz  + m[12]*ow;
;     vEye[i][1] = m[1]*ox + m[5]*oy + m[9]*oz  + m[13]*ow;
;     vEye[i][2] = m[2]*ox + m[6]*oy + m[10]*oz + m[14]*ow;
;     vEye[i][3] = m[3]*ox + m[7]*oy + m[11]*oz + m[15]*ow;
;   }
;
; Registers as used by the code:
;   r3 = n (loaded into CTR), r4 = vEye (written), r5 = m, r6 = vObj
;   f0-f15 = m[0]..m[15], f16/f17/f18/f23 = ox/oy/oz/ow
;   f19-f22 = the four result components
; f14-f23 are stored below r1 on entry and reloaded before returning.
        stfd    f14,-1*8(r1)
        stfd    f15,-2*8(r1)
        stfd    f16,-3*8(r1)
        stfd    f17,-4*8(r1)
        stfd    f18,-5*8(r1)
        stfd    f19,-6*8(r1)
        stfd    f20,-7*8(r1)
        stfd    f21,-8*8(r1)
        stfd    f22,-9*8(r1)
        stfd    f23,-10*8(r1)
        cmpwi   r3,0
        beq     .restore
        mtctr   r3
        subi    r6,r6,4                 ; pre-bias for lfsu/stfsu
        subi    r4,r4,4
        subi    r5,r5,4
        lfsu    f0,4(r5)                ; f0..f15 = m[0]..m[15]
        lfsu    f1,4(r5)
        lfsu    f2,4(r5)
        lfsu    f3,4(r5)
        lfsu    f4,4(r5)
        lfsu    f5,4(r5)
        lfsu    f6,4(r5)
        lfsu    f7,4(r5)
        lfsu    f8,4(r5)
        lfsu    f9,4(r5)
        lfsu    f10,4(r5)
        lfsu    f11,4(r5)
        lfsu    f12,4(r5)
        lfsu    f13,4(r5)
        lfsu    f14,4(r5)
        lfsu    f15,4(r5)
.next_vertex
        lfsu    f16,4(r6)               ; f16 = ox
        fmuls   f19,f0,f16
        fmuls   f20,f1,f16
        lfsu    f17,4(r6)               ; f17 = oy
        fmuls   f21,f2,f16
        fmuls   f22,f3,f16
        fmadds  f19,f4,f17,f19
        lfsu    f18,4(r6)               ; f18 = oz
        fmadds  f20,f5,f17,f20
        fmadds  f21,f6,f17,f21
        fmadds  f22,f7,f17,f22
        lfsu    f23,4(r6)               ; f23 = ow
        fmadds  f19,f8,f18,f19
        fmadds  f20,f9,f18,f20
        fmadds  f21,f10,f18,f21
        fmadds  f22,f11,f18,f22
        fmadds  f19,f12,f23,f19
        stfsu   f19,4(r4)               ; vEye[i][0]
        fmadds  f20,f13,f23,f20
        stfsu   f20,4(r4)               ; vEye[i][1]
        fmadds  f21,f14,f23,f21
        stfsu   f21,4(r4)               ; vEye[i][2]
        fmadds  f22,f15,f23,f22
        stfsu   f22,4(r4)               ; vEye[i][3]
        bdnz    .next_vertex
.restore
        lfd     f14,-1*8(r1)
        lfd     f15,-2*8(r1)
        lfd     f16,-3*8(r1)
        lfd     f17,-4*8(r1)
        lfd     f18,-5*8(r1)
        lfd     f19,-6*8(r1)
        lfd     f20,-7*8(r1)
        lfd     f21,-8*8(r1)
        lfd     f22,-9*8(r1)
        lfd     f23,-10*8(r1)
        blr
_asm_transform_points4_identity
; Identity transform for four-component vertices: a straight copy.
;
; C reference:
;   for (i=0;i<n;i++) {
;     vEye[i][0] = vObj[i][0];
;     vEye[i][1] = vObj[i][1];
;     vEye[i][2] = vObj[i][2];
;     vEye[i][3] = vObj[i][3];
;   }
;
; Registers as used by the code:
;   r3 = n (loaded into CTR), r4 = vEye (written), r5 = vObj (read)
;   r0 = word being copied (floats are moved as raw 32-bit words)
        cmpwi   r3,0
        beq     .exit
        mtctr   r3
        subi    r5,r5,4                 ; pre-bias for lwzu/stwu
        subi    r4,r4,4
.copy_vertex
        lwzu    r0,4(r5)                ; x
        stwu    r0,4(r4)
        lwzu    r0,4(r5)                ; y
        stwu    r0,4(r4)
        lwzu    r0,4(r5)                ; z
        stwu    r0,4(r4)
        lwzu    r0,4(r5)                ; w
        stwu    r0,4(r4)
        bdnz    .copy_vertex
.exit
        blr
_asm_transform_points4_2d
; 2D transform of four-component vertices; z and w are passed through.
;
; C reference:
;   const GLfloat *m = ctx->ModelViewMatrix;
;   for (i=0;i<n;i++) {
;     GLfloat ox = vObj[i][0], oy = vObj[i][1];
;     GLfloat oz = vObj[i][2], ow = vObj[i][3];
;     vEye[i][0] = m[0]*ox + m[4]*oy + m[12]*ow;
;     vEye[i][1] = m[1]*ox + m[5]*oy + m[13]*ow;
;     vEye[i][2] = oz;
;     vEye[i][3] = ow;
;   }
;
; Registers as used by the code:
;   r3 = n (loaded into CTR), r4 = vEye (written), r5 = m, r6 = vObj
;   f0,f1,f4,f5,f12,f13 = m[0],m[1],m[4],m[5],m[12],m[13]
;   f6-f9 = ox/oy/oz/ow, f2/f3 = results
        cmpwi   r3,0
        beq     .exit
        mtctr   r3
        subi    r6,r6,4                 ; pre-bias for lfsu/stfsu
        subi    r4,r4,4
        lfs     f0,0*4(r5)              ; m0
        lfs     f1,1*4(r5)              ; m1
        lfs     f4,4*4(r5)              ; m4
        lfs     f5,5*4(r5)              ; m5
        lfs     f12,12*4(r5)            ; m12
        lfs     f13,13*4(r5)            ; m13
.next_vertex
        lfsu    f6,4(r6)                ; f6 = ox
        fmuls   f2,f0,f6                ; m0*ox
        lfsu    f7,4(r6)                ; f7 = oy
        fmuls   f3,f1,f6                ; m1*ox
        lfsu    f8,4(r6)                ; f8 = oz
        fmadds  f2,f4,f7,f2             ; + m4*oy
        lfsu    f9,4(r6)                ; f9 = ow
        fmadds  f3,f5,f7,f3             ; + m5*oy
        fmadds  f2,f12,f9,f2            ; + m12*ow
        stfsu   f2,4(r4)                ; vEye[i][0]
        fmadds  f3,f13,f9,f3            ; + m13*ow
        stfsu   f3,4(r4)                ; vEye[i][1]
        stfsu   f8,4(r4)                ; vEye[i][2] = oz
        stfsu   f9,4(r4)                ; vEye[i][3] = ow
        bdnz    .next_vertex
.exit
        blr
_asm_transform_points4_2d_no_rot
; 2D transform without rotation for four-component vertices; z and w
; are passed through.
;
; C reference:
;   const GLfloat *m = ctx->ModelViewMatrix;
;   for (i=0;i<n;i++) {
;     GLfloat ox = vObj[i][0], oy = vObj[i][1];
;     GLfloat oz = vObj[i][2], ow = vObj[i][3];
;     vEye[i][0] = m[0]*ox + m[12]*ow;
;     vEye[i][1] = m[5]*oy + m[13]*ow;
;     vEye[i][2] = oz;
;     vEye[i][3] = ow;
;   }
;
; Registers as used by the code:
;   r3 = n (loaded into CTR), r4 = vEye (written), r5 = m, r6 = vObj
;   f0,f5,f12,f13 = m[0],m[5],m[12],m[13]
;   f6-f9 = ox/oy/oz/ow, f2/f3 = results
        cmpwi   r3,0
        beq     .exit
        mtctr   r3
        subi    r6,r6,4                 ; pre-bias for lfsu/stfsu
        subi    r4,r4,4
        lfs     f0,0*4(r5)              ; m0
        lfs     f5,5*4(r5)              ; m5
        lfs     f12,12*4(r5)            ; m12
        lfs     f13,13*4(r5)            ; m13
.next_vertex
        lfsu    f6,4(r6)                ; f6 = ox
        lfsu    f7,4(r6)                ; f7 = oy
        fmuls   f2,f0,f6                ; m0*ox
        lfsu    f8,4(r6)                ; f8 = oz
        fmuls   f3,f5,f7                ; m5*oy
        lfsu    f9,4(r6)                ; f9 = ow
        fmadds  f2,f12,f9,f2            ; + m12*ow
        stfsu   f2,4(r4)                ; vEye[i][0]
        fmadds  f3,f13,f9,f3            ; + m13*ow
        stfsu   f3,4(r4)                ; vEye[i][1]
        stfsu   f8,4(r4)                ; vEye[i][2] = oz
        stfsu   f9,4(r4)                ; vEye[i][3] = ow
        bdnz    .next_vertex
.exit
        blr
_asm_transform_points4_3d
; 3D affine transform of four-component vertices; w is passed through.
;
; C reference:
;   const GLfloat *m = ctx->ModelViewMatrix;
;   for (i=0;i<n;i++) {
;     GLfloat ox = vObj[i][0], oy = vObj[i][1];
;     GLfloat oz = vObj[i][2], ow = vObj[i][3];
;     vEye[i][0] = m[0]*ox + m[4]*oy + m[8]*oz  + m[12]*ow;
;     vEye[i][1] = m[1]*ox + m[5]*oy + m[9]*oz  + m[13]*ow;
;     vEye[i][2] = m[2]*ox + m[6]*oy + m[10]*oz + m[14]*ow;
;     vEye[i][3] = ow;
;   }
;
; Registers as used by the code:
;   r3 = n (loaded into CTR), r4 = vEye (written), r5 = m, r6 = vObj
;   f0-f2, f4-f6, f8-f10, f12-f14 = matrix columns without the 4th row
;   f3/f7/f11/f18 = ox/oy/oz/ow, f15-f17 = results
; f14-f18 are stored below r1 on entry and reloaded before returning.
        stfd    f14,-1*8(r1)
        stfd    f15,-2*8(r1)
        stfd    f16,-3*8(r1)
        stfd    f17,-4*8(r1)
        stfd    f18,-5*8(r1)
        cmpwi   r3,0
        beq     .restore
        mtctr   r3
        subi    r6,r6,4                 ; pre-bias for lfsu/stfsu
        subi    r4,r4,4
        subi    r5,r5,4
        lfsu    f0,4(r5)                ; m0
        lfsu    f1,4(r5)                ; m1
        lfsu    f2,4(r5)                ; m2
        addi    r5,r5,4                 ; m3 is not needed
        lfsu    f4,4(r5)                ; m4
        lfsu    f5,4(r5)                ; m5
        lfsu    f6,4(r5)                ; m6
        addi    r5,r5,4                 ; m7 is not needed
        lfsu    f8,4(r5)                ; m8
        lfsu    f9,4(r5)                ; m9
        lfsu    f10,4(r5)               ; m10
        addi    r5,r5,4                 ; m11 is not needed
        lfsu    f12,4(r5)               ; m12
        lfsu    f13,4(r5)               ; m13
        lfsu    f14,4(r5)               ; m14
.next_vertex
        lfsu    f3,4(r6)                ; f3 = ox
        fmuls   f15,f0,f3
        fmuls   f16,f1,f3
        lfsu    f7,4(r6)                ; f7 = oy
        fmuls   f17,f2,f3
        fmadds  f15,f4,f7,f15
        lfsu    f11,4(r6)               ; f11 = oz
        fmadds  f16,f5,f7,f16
        fmadds  f17,f6,f7,f17
        fmadds  f15,f8,f11,f15
        lfsu    f18,4(r6)               ; f18 = ow
        fmadds  f16,f9,f11,f16
        fmadds  f17,f10,f11,f17
        fmadds  f15,f12,f18,f15
        stfsu   f15,4(r4)               ; vEye[i][0]
        fmadds  f16,f13,f18,f16
        stfsu   f16,4(r4)               ; vEye[i][1]
        fmadds  f17,f14,f18,f17
        stfsu   f17,4(r4)               ; vEye[i][2]
        stfsu   f18,4(r4)               ; vEye[i][3] = ow
        bdnz    .next_vertex
.restore
        lfd     f14,-1*8(r1)
        lfd     f15,-2*8(r1)
        lfd     f16,-3*8(r1)
        lfd     f17,-4*8(r1)
        lfd     f18,-5*8(r1)
        blr
_asm_project_and_cliptest_general
; Project n eye-space vertices with a full 4x4 matrix and compute the
; clip flags of every projected vertex.
;
; C reference:
;   const GLfloat *m = ctx->ProjectionMatrix;
;   for (i=0;i<n;i++) {
;     GLfloat ex = vEye[i][0], ey = vEye[i][1];
;     GLfloat ez = vEye[i][2], ew = vEye[i][3];
;     GLfloat cx = m[0]*ex + m[4]*ey + m[8]*ez  + m[12]*ew;
;     GLfloat cy = m[1]*ex + m[5]*ey + m[9]*ez  + m[13]*ew;
;     GLfloat cz = m[2]*ex + m[6]*ey + m[10]*ez + m[14]*ew;
;     GLfloat cw = m[3]*ex + m[7]*ey + m[11]*ez + m[15]*ew;
;     GLubyte mask = 0;
;     vClip[i][0] = cx;  vClip[i][1] = cy;
;     vClip[i][2] = cz;  vClip[i][3] = cw;
;     if (cx > cw)       mask |= CLIP_RIGHT_BIT;
;     else if (cx < -cw) mask |= CLIP_LEFT_BIT;
;     if (cy > cw)       mask |= CLIP_TOP_BIT;
;     else if (cy < -cw) mask |= CLIP_BOTTOM_BIT;
;     if (cz > cw)       mask |= CLIP_FAR_BIT;
;     else if (cz < -cw) mask |= CLIP_NEAR_BIT;
;     if (mask) {
;       clipMask[i] |= mask;
;       tmpOrMask |= mask;
;     }
;     tmpAndMask &= mask;
;   }
;
; Registers as used by the code:
;   r3 = n (loaded into CTR)       r4 = vClip (written)
;   r5 = m (16 floats)             r6 = vEye (read)
;   r7 = clipMask (one byte per vertex, read-modify-write)
;   r8 = byte holding the or-mask, r9 = byte holding the and-mask;
;        both are read before the loop and written back after it
;   r10 = mask of the current vertex, r11 = running or-mask,
;   r12 = running and-mask, r0 = scratch
;   f0-f15 = matrix, f19-f22 = cx/cy/cz/cw
; f14-f23 are stored below r1 on entry and reloaded before returning.
;
; Two variants are present. "IFNE 1" is always true, so the first one
; is assembled. It first compares max(|cx|,|cy|,|cz|) with cw and only
; walks the six-way compare ladder when that maximum is greater than
; cw; otherwise the mask stays 0. The variant after ELSEIF (used here
; as the alternative part of the conditional) runs the ladder for
; every vertex.
        IFNE    1
        stfd    f14,-1*8(r1)
        stfd    f15,-2*8(r1)
        stfd    f16,-3*8(r1)
        stfd    f17,-4*8(r1)
        stfd    f18,-5*8(r1)
        stfd    f19,-6*8(r1)
        stfd    f20,-7*8(r1)
        stfd    f21,-8*8(r1)
        stfd    f22,-9*8(r1)
        stfd    f23,-10*8(r1)
        cmpwi   r3,0
        beq     .restore
        subi    r5,r5,4
        lfsu    f0,4(r5)                ; f0..f15 = m[0]..m[15]
        lfsu    f1,4(r5)
        lfsu    f2,4(r5)
        lfsu    f3,4(r5)
        lfsu    f4,4(r5)
        lfsu    f5,4(r5)
        lfsu    f6,4(r5)
        lfsu    f7,4(r5)
        lfsu    f8,4(r5)
        lfsu    f9,4(r5)
        lfsu    f10,4(r5)
        lfsu    f11,4(r5)
        lfsu    f12,4(r5)
        lfsu    f13,4(r5)
        lfsu    f14,4(r5)
        lfsu    f15,4(r5)
        mtctr   r3
        subi    r6,r6,4                 ; pre-bias for lfsu/stfsu
        subi    r4,r4,4
        lbz     r11,0(r8)               ; r11 = incoming or-mask
        lbz     r12,0(r9)               ; r12 = incoming and-mask
.next_vertex
        li      r10,0                   ; mask = 0
        lfsu    f16,4(r6)               ; f16 = ex
        fmuls   f19,f0,f16
        fmuls   f20,f1,f16
        lfsu    f17,4(r6)               ; f17 = ey
        fmuls   f21,f2,f16
        fmuls   f22,f3,f16
        fmadds  f19,f4,f17,f19
        lfsu    f18,4(r6)               ; f18 = ez
        fmadds  f20,f5,f17,f20
        fmadds  f21,f6,f17,f21
        fmadds  f22,f7,f17,f22
        lfsu    f23,4(r6)               ; f23 = ew
        fmadds  f19,f8,f18,f19
        fmadds  f20,f9,f18,f20
        fmadds  f21,f10,f18,f21
        fmadds  f22,f11,f18,f22
        fmadds  f19,f12,f23,f19         ; f19 = cx
        stfsu   f19,4(r4)
        fmadds  f20,f13,f23,f20         ; f20 = cy
        stfsu   f20,4(r4)
        fmadds  f21,f14,f23,f21         ; f21 = cz
        stfsu   f21,4(r4)
        fmadds  f22,f15,f23,f22         ; f22 = cw
        stfsu   f22,4(r4)
        fabs    f16,f19                 ; |cx|
        fabs    f17,f20                 ; |cy|
        fabs    f18,f21                 ; |cz|
        fsubs   f23,f16,f17
        fsel    f16,f23,f16,f17         ; f16 = max(|cx|,|cy|)
        fsubs   f17,f16,f18
        fsel    f18,f17,f16,f18         ; f18 = max(|cx|,|cy|,|cz|)
        fcmpu   f18,f22
        ble     .accumulate             ; not greater than cw: mask stays 0
        fneg    f16,f22                 ; f16 = -cw
        fcmpu   f19,f22
        ble     .test_left
        ori     r10,r10,CLIP_RIGHT_BIT
        b       .test_top
.test_left
        fcmpu   f19,f16
        bge     .test_top
        ori     r10,r10,CLIP_LEFT_BIT
.test_top
        fcmpu   f20,f22
        ble     .test_bottom
        ori     r10,r10,CLIP_TOP_BIT
        b       .test_far
.test_bottom
        fcmpu   f20,f16
        bge     .test_far
        ori     r10,r10,CLIP_BOTTOM_BIT
.test_far
        fcmpu   f21,f22
        ble     .test_near
        ori     r10,r10,CLIP_FAR_BIT
        b       .merge
.test_near
        fcmpu   f21,f16
        bge     .merge
        ori     r10,r10,CLIP_NEAR_BIT
.merge
        cmpwi   r10,0
        beq     .accumulate
        lbz     r0,0(r7)                ; clipMask[i] |= mask
        or      r0,r0,r10
        stb     r0,0(r7)
        or      r11,r11,r10             ; tmpOrMask |= mask
.accumulate
        and     r12,r12,r10             ; tmpAndMask &= mask
        addi    r7,r7,1
        bdnz    .next_vertex
        stb     r11,0(r8)               ; write the accumulated masks back
        stb     r12,0(r9)
.restore
        lfd     f14,-1*8(r1)
        lfd     f15,-2*8(r1)
        lfd     f16,-3*8(r1)
        lfd     f17,-4*8(r1)
        lfd     f18,-5*8(r1)
        lfd     f19,-6*8(r1)
        lfd     f20,-7*8(r1)
        lfd     f21,-8*8(r1)
        lfd     f22,-9*8(r1)
        lfd     f23,-10*8(r1)
        blr
        ELSEIF
        stfd    f14,-1*8(r1)
        stfd    f15,-2*8(r1)
        stfd    f16,-3*8(r1)
        stfd    f17,-4*8(r1)
        stfd    f18,-5*8(r1)
        stfd    f19,-6*8(r1)
        stfd    f20,-7*8(r1)
        stfd    f21,-8*8(r1)
        stfd    f22,-9*8(r1)
        stfd    f23,-10*8(r1)
        cmpwi   r3,0
        beq     .restore
        subi    r5,r5,4
        lfsu    f0,4(r5)                ; f0..f15 = m[0]..m[15]
        lfsu    f1,4(r5)
        lfsu    f2,4(r5)
        lfsu    f3,4(r5)
        lfsu    f4,4(r5)
        lfsu    f5,4(r5)
        lfsu    f6,4(r5)
        lfsu    f7,4(r5)
        lfsu    f8,4(r5)
        lfsu    f9,4(r5)
        lfsu    f10,4(r5)
        lfsu    f11,4(r5)
        lfsu    f12,4(r5)
        lfsu    f13,4(r5)
        lfsu    f14,4(r5)
        lfsu    f15,4(r5)
        mtctr   r3
        subi    r6,r6,4
        subi    r4,r4,4
        lbz     r11,0(r8)
        lbz     r12,0(r9)
.next_vertex
        li      r10,0                   ; mask = 0
        lfsu    f16,4(r6)               ; f16 = ex
        fmuls   f19,f0,f16
        fmuls   f20,f1,f16
        lfsu    f17,4(r6)               ; f17 = ey
        fmuls   f21,f2,f16
        fmuls   f22,f3,f16
        fmadds  f19,f4,f17,f19
        lfsu    f18,4(r6)               ; f18 = ez
        fmadds  f20,f5,f17,f20
        fmadds  f21,f6,f17,f21
        fmadds  f22,f7,f17,f22
        lfsu    f23,4(r6)               ; f23 = ew
        fmadds  f19,f8,f18,f19
        fmadds  f20,f9,f18,f20
        fmadds  f21,f10,f18,f21
        fmadds  f22,f11,f18,f22
        fmadds  f19,f12,f23,f19         ; f19 = cx
        stfsu   f19,4(r4)
        fmadds  f20,f13,f23,f20         ; f20 = cy
        stfsu   f20,4(r4)
        fmadds  f21,f14,f23,f21         ; f21 = cz
        stfsu   f21,4(r4)
        fmadds  f22,f15,f23,f22         ; f22 = cw
        stfsu   f22,4(r4)
        fneg    f16,f22                 ; f16 = -cw
        fcmpu   f19,f22
        ble     .test_left
        ori     r10,r10,CLIP_RIGHT_BIT
        b       .test_top
.test_left
        fcmpu   f19,f16
        bge     .test_top
        ori     r10,r10,CLIP_LEFT_BIT
.test_top
        fcmpu   f20,f22
        ble     .test_bottom
        ori     r10,r10,CLIP_TOP_BIT
        b       .test_far
.test_bottom
        fcmpu   f20,f16
        bge     .test_far
        ori     r10,r10,CLIP_BOTTOM_BIT
.test_far
        fcmpu   f21,f22
        ble     .test_near
        ori     r10,r10,CLIP_FAR_BIT
        b       .merge
.test_near
        fcmpu   f21,f16
        bge     .merge
        ori     r10,r10,CLIP_NEAR_BIT
.merge
        cmpwi   r10,0
        beq     .accumulate
        lbz     r0,0(r7)
        or      r0,r0,r10
        stb     r0,0(r7)
        or      r11,r11,r10
.accumulate
        and     r12,r12,r10
        addi    r7,r7,1
        bdnz    .next_vertex
        stb     r11,0(r8)
        stb     r12,0(r9)
.restore
        lfd     f14,-1*8(r1)
        lfd     f15,-2*8(r1)
        lfd     f16,-3*8(r1)
        lfd     f17,-4*8(r1)
        lfd     f18,-5*8(r1)
        lfd     f19,-6*8(r1)
        lfd     f20,-7*8(r1)
        lfd     f21,-8*8(r1)
        lfd     f22,-9*8(r1)
        lfd     f23,-10*8(r1)
        blr
        ENDC
_asm_project_and_cliptest_identity
; Identity projection: copy every eye-space vertex to clip space and
; compute its clip flags.
;
; C reference:
;   for (i=0;i<n;i++) {
;     GLfloat cx = vClip[i][0] = vEye[i][0];
;     GLfloat cy = vClip[i][1] = vEye[i][1];
;     GLfloat cz = vClip[i][2] = vEye[i][2];
;     GLfloat cw = vClip[i][3] = vEye[i][3];
;     GLubyte mask = 0;
;     if (cx > cw)       mask |= CLIP_RIGHT_BIT;
;     else if (cx < -cw) mask |= CLIP_LEFT_BIT;
;     if (cy > cw)       mask |= CLIP_TOP_BIT;
;     else if (cy < -cw) mask |= CLIP_BOTTOM_BIT;
;     if (cz > cw)       mask |= CLIP_FAR_BIT;
;     else if (cz < -cw) mask |= CLIP_NEAR_BIT;
;     if (mask) {
;       clipMask[i] |= mask;
;       tmpOrMask |= mask;
;     }
;     tmpAndMask &= mask;
;   }
;
; Registers as used by the code:
;   r3 = n (loaded into CTR)       r4 = vClip (written)
;   r5 = vEye (read)               r6 = clipMask (one byte per vertex)
;   r7 = byte holding the or-mask, r8 = byte holding the and-mask;
;        both are read before the loop and written back after it
;   r10 = mask of the current vertex, r11 = running or-mask,
;   r12 = running and-mask, r0 = scratch
;   f0/f1/f2/f3 = cx/cy/cz/cw, f4 = -cw, f6-f8 and f11 = scratch
;
; Two variants are present. "IFNE 1" is always true, so the first one
; is assembled. It compares max(|cx|,|cy|,|cz|) with cw and only walks
; the compare ladder when that maximum is greater than cw. The variant
; after ELSEIF (used here as the alternative part of the conditional)
; runs the ladder for every vertex.
;
; Fix: the early-out used to take the absolute values of f1/f2/f3
; (cy, cz, cw), so cx never took part in the test and a vertex that was
; outside only in x could be reported with mask 0. It now uses
; f0/f1/f2 (cx, cy, cz), matching the C reference.
        IFNE    1
        cmpwi   r3,0
        beq     .exit
        mtctr   r3
        subi    r5,r5,4                 ; pre-bias for lfsu/stfsu
        subi    r4,r4,4
        lbz     r11,0(r7)               ; r11 = incoming or-mask
        lbz     r12,0(r8)               ; r12 = incoming and-mask
.next_vertex
        li      r10,0                   ; mask = 0
        lfsu    f0,4(r5)                ; f0 = cx
        stfsu   f0,4(r4)
        lfsu    f1,4(r5)                ; f1 = cy
        stfsu   f1,4(r4)
        lfsu    f2,4(r5)                ; f2 = cz
        stfsu   f2,4(r4)
        lfsu    f3,4(r5)                ; f3 = cw
        stfsu   f3,4(r4)
        fabs    f6,f0                   ; |cx|
        fabs    f7,f1                   ; |cy|
        fabs    f8,f2                   ; |cz|
        fsubs   f11,f6,f7
        fsel    f6,f11,f6,f7            ; f6 = max(|cx|,|cy|)
        fsubs   f7,f6,f8
        fsel    f8,f7,f6,f8             ; f8 = max(|cx|,|cy|,|cz|)
        fcmpu   f8,f3
        ble     .accumulate             ; not greater than cw: mask stays 0
        fneg    f4,f3                   ; f4 = -cw
        fcmpu   f0,f3
        ble     .test_left
        ori     r10,r10,CLIP_RIGHT_BIT
        b       .test_top
.test_left
        fcmpu   f0,f4
        bge     .test_top
        ori     r10,r10,CLIP_LEFT_BIT
.test_top
        fcmpu   f1,f3
        ble     .test_bottom
        ori     r10,r10,CLIP_TOP_BIT
        b       .test_far
.test_bottom
        fcmpu   f1,f4
        bge     .test_far
        ori     r10,r10,CLIP_BOTTOM_BIT
.test_far
        fcmpu   f2,f3
        ble     .test_near
        ori     r10,r10,CLIP_FAR_BIT
        b       .merge
.test_near
        fcmpu   f2,f4
        bge     .merge
        ori     r10,r10,CLIP_NEAR_BIT
.merge
        cmpwi   r10,0
        beq     .accumulate
        lbz     r0,0(r6)                ; clipMask[i] |= mask
        or      r0,r0,r10
        stb     r0,0(r6)
        or      r11,r11,r10             ; tmpOrMask |= mask
.accumulate
        and     r12,r12,r10             ; tmpAndMask &= mask
        addi    r6,r6,1
        bdnz    .next_vertex
        stb     r11,0(r7)               ; write the accumulated masks back
        stb     r12,0(r8)
.exit
        blr
        ELSEIF
        cmpwi   r3,0
        beq     .exit
        mtctr   r3
        subi    r5,r5,4
        subi    r4,r4,4
        lbz     r11,0(r7)
        lbz     r12,0(r8)
.next_vertex
        li      r10,0                   ; mask = 0
        lfsu    f0,4(r5)                ; f0 = cx
        stfsu   f0,4(r4)
        lfsu    f1,4(r5)                ; f1 = cy
        stfsu   f1,4(r4)
        lfsu    f2,4(r5)                ; f2 = cz
        stfsu   f2,4(r4)
        lfsu    f3,4(r5)                ; f3 = cw
        stfsu   f3,4(r4)
        fneg    f4,f3                   ; f4 = -cw
        fcmpu   f0,f3
        ble     .test_left
        ori     r10,r10,CLIP_RIGHT_BIT
        b       .test_top
.test_left
        fcmpu   f0,f4
        bge     .test_top
        ori     r10,r10,CLIP_LEFT_BIT
.test_top
        fcmpu   f1,f3
        ble     .test_bottom
        ori     r10,r10,CLIP_TOP_BIT
        b       .test_far
.test_bottom
        fcmpu   f1,f4
        bge     .test_far
        ori     r10,r10,CLIP_BOTTOM_BIT
.test_far
        fcmpu   f2,f3
        ble     .test_near
        ori     r10,r10,CLIP_FAR_BIT
        b       .merge
.test_near
        fcmpu   f2,f4
        bge     .merge
        ori     r10,r10,CLIP_NEAR_BIT
.merge
        cmpwi   r10,0
        beq     .accumulate
        lbz     r0,0(r6)
        or      r0,r0,r10
        stb     r0,0(r6)
        or      r11,r11,r10
.accumulate
        and     r12,r12,r10
        addi    r6,r6,1
        bdnz    .next_vertex
        stb     r11,0(r7)
        stb     r12,0(r8)
.exit
        blr
        ENDC
_asm_project_and_cliptest_ortho
; Orthographic projection (per-axis scale plus translation) combined
; with the clip test.
;
; C reference:
;   const GLfloat *m = ctx->ProjectionMatrix;
;   for (i=0;i<n;i++) {
;     GLfloat ex = vEye[i][0], ey = vEye[i][1];
;     GLfloat ez = vEye[i][2], ew = vEye[i][3];
;     GLfloat cx = m[0]*ex  + m[12]*ew;
;     GLfloat cy = m[5]*ey  + m[13]*ew;
;     GLfloat cz = m[10]*ez + m[14]*ew;
;     GLfloat cw = ew;
;     GLubyte mask = 0;
;     vClip[i][0] = cx;  vClip[i][1] = cy;
;     vClip[i][2] = cz;  vClip[i][3] = cw;
;     if (cx > cw)       mask |= CLIP_RIGHT_BIT;
;     else if (cx < -cw) mask |= CLIP_LEFT_BIT;
;     if (cy > cw)       mask |= CLIP_TOP_BIT;
;     else if (cy < -cw) mask |= CLIP_BOTTOM_BIT;
;     if (cz > cw)       mask |= CLIP_FAR_BIT;
;     else if (cz < -cw) mask |= CLIP_NEAR_BIT;
;     if (mask) {
;       clipMask[i] |= mask;
;       tmpOrMask |= mask;
;     }
;     tmpAndMask &= mask;
;   }
;
; Registers as used by the code:
;   r3 = n (loaded into CTR)       r4 = vClip (written)
;   r5 = m                         r6 = vEye (read)
;   r7 = clipMask (one byte per vertex, read-modify-write)
;   r8 = byte holding the or-mask, r9 = byte holding the and-mask;
;        both are read before the loop and written back after it
;   r10 = mask of the current vertex, r11 = running or-mask,
;   r12 = running and-mask, r0 = scratch
;   f0,f5,f10,f12,f13,f4 = m[0],m[5],m[10],m[12],m[13],m[14]
;   f1/f2/f3 = cx/cy/cz, f9 = ew = cw, f11 = scratch / -cw
;
; Two variants are present. "IFNE 1" is always true, so the first one
; is assembled. It compares max(|cx|,|cy|,|cz|) with cw and only walks
; the compare ladder when that maximum is greater than cw. The variant
; after ELSEIF (used here as the alternative part of the conditional)
; runs the ladder for every vertex.
        IFNE    1
        cmpwi   r3,0
        beq     .exit
        lfs     f0,0*4(r5)              ; m0
        lfs     f5,5*4(r5)              ; m5
        lfs     f10,10*4(r5)            ; m10
        lfs     f12,12*4(r5)            ; m12
        lfs     f13,13*4(r5)            ; m13
        lfs     f4,14*4(r5)             ; m14
        mtctr   r3
        subi    r6,r6,4                 ; pre-bias for lfsu/stfsu
        subi    r4,r4,4
        lbz     r11,0(r8)               ; r11 = incoming or-mask
        lbz     r12,0(r9)               ; r12 = incoming and-mask
.next_vertex
        li      r10,0                   ; mask = 0
        lfsu    f6,4(r6)                ; f6 = ex
        lfsu    f7,4(r6)                ; f7 = ey
        fmuls   f1,f0,f6
        lfsu    f8,4(r6)                ; f8 = ez
        fmuls   f2,f5,f7
        lfsu    f9,4(r6)                ; f9 = ew
        fmuls   f3,f10,f8
        fmadds  f1,f12,f9,f1            ; f1 = cx
        stfsu   f1,4(r4)
        fmadds  f2,f13,f9,f2            ; f2 = cy
        stfsu   f2,4(r4)
        fmadds  f3,f4,f9,f3             ; f3 = cz
        stfsu   f3,4(r4)
        stfsu   f9,4(r4)                ; cw = ew
        fabs    f6,f1                   ; |cx|
        fabs    f7,f2                   ; |cy|
        fabs    f8,f3                   ; |cz|
        fsubs   f11,f6,f7
        fsel    f6,f11,f6,f7            ; f6 = max(|cx|,|cy|)
        fsubs   f7,f6,f8
        fsel    f8,f7,f6,f8             ; f8 = max(|cx|,|cy|,|cz|)
        fcmpu   f8,f9
        ble     .accumulate             ; not greater than cw: mask stays 0
        fneg    f11,f9                  ; f11 = -cw
        fcmpu   f1,f9
        ble     .test_left
        ori     r10,r10,CLIP_RIGHT_BIT
        b       .test_top
.test_left
        fcmpu   f1,f11
        bge     .test_top
        ori     r10,r10,CLIP_LEFT_BIT
.test_top
        fcmpu   f2,f9
        ble     .test_bottom
        ori     r10,r10,CLIP_TOP_BIT
        b       .test_far
.test_bottom
        fcmpu   f2,f11
        bge     .test_far
        ori     r10,r10,CLIP_BOTTOM_BIT
.test_far
        fcmpu   f3,f9
        ble     .test_near
        ori     r10,r10,CLIP_FAR_BIT
        b       .merge
.test_near
        fcmpu   f3,f11
        bge     .merge
        ori     r10,r10,CLIP_NEAR_BIT
.merge
        cmpwi   r10,0
        beq     .accumulate
        lbz     r0,0(r7)                ; clipMask[i] |= mask
        or      r0,r0,r10
        stb     r0,0(r7)
        or      r11,r11,r10             ; tmpOrMask |= mask
.accumulate
        and     r12,r12,r10             ; tmpAndMask &= mask
        addi    r7,r7,1
        bdnz    .next_vertex
        stb     r11,0(r8)               ; write the accumulated masks back
        stb     r12,0(r9)
.exit
        blr
        ELSEIF
        cmpwi   r3,0
        beq     .exit
        lfs     f0,0*4(r5)              ; m0
        lfs     f5,5*4(r5)              ; m5
        lfs     f10,10*4(r5)            ; m10
        lfs     f12,12*4(r5)            ; m12
        lfs     f13,13*4(r5)            ; m13
        lfs     f4,14*4(r5)             ; m14
        mtctr   r3
        subi    r6,r6,4
        subi    r4,r4,4
        lbz     r11,0(r8)
        lbz     r12,0(r9)
.next_vertex
        li      r10,0                   ; mask = 0
        lfsu    f6,4(r6)                ; f6 = ex
        lfsu    f7,4(r6)                ; f7 = ey
        fmuls   f1,f0,f6
        lfsu    f8,4(r6)                ; f8 = ez
        fmuls   f2,f5,f7
        lfsu    f9,4(r6)                ; f9 = ew
        fmuls   f3,f10,f8
        fmadds  f1,f12,f9,f1            ; f1 = cx
        stfsu   f1,4(r4)
        fmadds  f2,f13,f9,f2            ; f2 = cy
        stfsu   f2,4(r4)
        fmadds  f3,f4,f9,f3             ; f3 = cz
        stfsu   f3,4(r4)
        stfsu   f9,4(r4)                ; cw = ew
        fneg    f11,f9                  ; f11 = -cw
        fcmpu   f1,f9
        ble     .test_left
        ori     r10,r10,CLIP_RIGHT_BIT
        b       .test_top
.test_left
        fcmpu   f1,f11
        bge     .test_top
        ori     r10,r10,CLIP_LEFT_BIT
.test_top
        fcmpu   f2,f9
        ble     .test_bottom
        ori     r10,r10,CLIP_TOP_BIT
        b       .test_far
.test_bottom
        fcmpu   f2,f11
        bge     .test_far
        ori     r10,r10,CLIP_BOTTOM_BIT
.test_far
        fcmpu   f3,f9
        ble     .test_near
        ori     r10,r10,CLIP_FAR_BIT
        b       .merge
.test_near
        fcmpu   f3,f11
        bge     .merge
        ori     r10,r10,CLIP_NEAR_BIT
.merge
        cmpwi   r10,0
        beq     .accumulate
        lbz     r0,0(r7)
        or      r0,r0,r10
        stb     r0,0(r7)
        or      r11,r11,r10
.accumulate
        and     r12,r12,r10
        addi    r7,r7,1
        bdnz    .next_vertex
        stb     r11,0(r8)
        stb     r12,0(r9)
.exit
        blr
        ENDC
_asm_project_and_cliptest_perspective
; Perspective projection (cw = -ez) combined with the clip test.
;
; C reference:
;   const GLfloat *m = ctx->ProjectionMatrix;
;   for (i=0;i<n;i++) {
;     GLfloat ex = vEye[i][0], ey = vEye[i][1];
;     GLfloat ez = vEye[i][2], ew = vEye[i][3];
;     GLfloat cx = m[0]*ex  + m[8]*ez;
;     GLfloat cy = m[5]*ey  + m[9]*ez;
;     GLfloat cz = m[10]*ez + m[14]*ew;
;     GLfloat cw = -ez;
;     GLubyte mask = 0;
;     vClip[i][0] = cx;  vClip[i][1] = cy;
;     vClip[i][2] = cz;  vClip[i][3] = cw;
;     if (cx > cw)       mask |= CLIP_RIGHT_BIT;
;     else if (cx < -cw) mask |= CLIP_LEFT_BIT;
;     if (cy > cw)       mask |= CLIP_TOP_BIT;
;     else if (cy < -cw) mask |= CLIP_BOTTOM_BIT;
;     if (cz > cw)       mask |= CLIP_FAR_BIT;
;     else if (cz < -cw) mask |= CLIP_NEAR_BIT;
;     if (mask) {
;       clipMask[i] |= mask;
;       tmpOrMask |= mask;
;     }
;     tmpAndMask &= mask;
;   }
;
; Registers as used by the code:
;   r3 = n (loaded into CTR)       r4 = vClip (written)
;   r5 = m                         r6 = vEye (read)
;   r7 = clipMask (one byte per vertex, read-modify-write)
;   r8 = byte holding the or-mask, r9 = byte holding the and-mask;
;        both are read before the loop and written back after it
;   r10 = mask of the current vertex, r11 = running or-mask,
;   r12 = running and-mask, r0 = scratch
;   f0,f5,f8,f9,f10,f4 = m[0],m[5],m[8],m[9],m[10],m[14]
;   f1/f2/f3 = cx/cy/cz, f12 = ez (later scratch), f13 = ew, then cw
;
; Two variants are present. "IFNE 1" is always true, so the first one
; is assembled. It compares max(|cx|,|cy|,|cz|) with cw and only walks
; the compare ladder when that maximum is greater than cw. The variant
; after ELSEIF (used here as the alternative part of the conditional)
; runs the ladder for every vertex.
        IFNE    1
        cmpwi   r3,0
        beq     .exit
        lfs     f0,0*4(r5)              ; m0
        lfs     f5,5*4(r5)              ; m5
        lfs     f8,8*4(r5)              ; m8
        lfs     f9,9*4(r5)              ; m9
        lfs     f10,10*4(r5)            ; m10
        lfs     f4,14*4(r5)             ; m14
        mtctr   r3
        subi    r6,r6,4                 ; pre-bias for lfsu/stfsu
        subi    r4,r4,4
        lbz     r11,0(r8)               ; r11 = incoming or-mask
        lbz     r12,0(r9)               ; r12 = incoming and-mask
.next_vertex
        li      r10,0                   ; mask = 0
        lfsu    f6,4(r6)                ; f6 = ex
        lfsu    f7,4(r6)                ; f7 = ey
        fmuls   f1,f0,f6
        lfsu    f12,4(r6)               ; f12 = ez
        fmuls   f2,f5,f7
        lfsu    f13,4(r6)               ; f13 = ew
        fmuls   f3,f10,f12
        fmadds  f1,f8,f12,f1            ; f1 = cx
        stfsu   f1,4(r4)
        fmadds  f2,f9,f12,f2            ; f2 = cy
        stfsu   f2,4(r4)
        fmadds  f3,f4,f13,f3            ; f3 = cz
        stfsu   f3,4(r4)
        fneg    f13,f12                 ; f13 = cw = -ez
        stfsu   f13,4(r4)
        fabs    f6,f1                   ; |cx|
        fabs    f7,f2                   ; |cy|
        fabs    f12,f3                  ; |cz|
        fsubs   f11,f6,f7
        fsel    f6,f11,f6,f7            ; f6 = max(|cx|,|cy|)
        fsubs   f7,f6,f12
        fsel    f12,f7,f6,f12           ; f12 = max(|cx|,|cy|,|cz|)
        fcmpu   f12,f13
        ble     .accumulate             ; not greater than cw: mask stays 0
        fneg    f11,f13                 ; f11 = -cw
        fcmpu   f1,f13
        ble     .test_left
        ori     r10,r10,CLIP_RIGHT_BIT
        b       .test_top
.test_left
        fcmpu   f1,f11
        bge     .test_top
        ori     r10,r10,CLIP_LEFT_BIT
.test_top
        fcmpu   f2,f13
        ble     .test_bottom
        ori     r10,r10,CLIP_TOP_BIT
        b       .test_far
.test_bottom
        fcmpu   f2,f11
        bge     .test_far
        ori     r10,r10,CLIP_BOTTOM_BIT
.test_far
        fcmpu   f3,f13
        ble     .test_near
        ori     r10,r10,CLIP_FAR_BIT
        b       .merge
.test_near
        fcmpu   f3,f11
        bge     .merge
        ori     r10,r10,CLIP_NEAR_BIT
.merge
        cmpwi   r10,0
        beq     .accumulate
        lbz     r0,0(r7)                ; clipMask[i] |= mask
        or      r0,r0,r10
        stb     r0,0(r7)
        or      r11,r11,r10             ; tmpOrMask |= mask
.accumulate
        and     r12,r12,r10             ; tmpAndMask &= mask
        addi    r7,r7,1
        bdnz    .next_vertex
        stb     r11,0(r8)               ; write the accumulated masks back
        stb     r12,0(r9)
.exit
        blr
        ELSEIF
        cmpwi   r3,0
        beq     .exit
        lfs     f0,0*4(r5)              ; m0
        lfs     f5,5*4(r5)              ; m5
        lfs     f8,8*4(r5)              ; m8
        lfs     f9,9*4(r5)              ; m9
        lfs     f10,10*4(r5)            ; m10
        lfs     f4,14*4(r5)             ; m14
        mtctr   r3
        subi    r6,r6,4
        subi    r4,r4,4
        lbz     r11,0(r8)
        lbz     r12,0(r9)
.next_vertex
        li      r10,0                   ; mask = 0
        lfsu    f6,4(r6)                ; f6 = ex
        lfsu    f7,4(r6)                ; f7 = ey
        fmuls   f1,f0,f6
        lfsu    f12,4(r6)               ; f12 = ez
        fmuls   f2,f5,f7
        lfsu    f13,4(r6)               ; f13 = ew
        fmuls   f3,f10,f12
        fmadds  f1,f8,f12,f1            ; f1 = cx
        stfsu   f1,4(r4)
        fmadds  f2,f9,f12,f2            ; f2 = cy
        stfsu   f2,4(r4)
        fmadds  f3,f4,f13,f3            ; f3 = cz
        stfsu   f3,4(r4)
        fneg    f13,f12                 ; f13 = cw = -ez
        stfsu   f13,4(r4)
        fneg    f11,f13                 ; f11 = -cw
        fcmpu   f1,f13
        ble     .test_left
        ori     r10,r10,CLIP_RIGHT_BIT
        b       .test_top
.test_left
        fcmpu   f1,f11
        bge     .test_top
        ori     r10,r10,CLIP_LEFT_BIT
.test_top
        fcmpu   f2,f13
        ble     .test_bottom
        ori     r10,r10,CLIP_TOP_BIT
        b       .test_far
.test_bottom
        fcmpu   f2,f11
        bge     .test_far
        ori     r10,r10,CLIP_BOTTOM_BIT
.test_far
        fcmpu   f3,f13
        ble     .test_near
        ori     r10,r10,CLIP_FAR_BIT
        b       .merge
.test_near
        fcmpu   f3,f11
        bge     .merge
        ori     r10,r10,CLIP_NEAR_BIT
.merge
        cmpwi   r10,0
        beq     .accumulate
        lbz     r0,0(r7)
        or      r0,r0,r10
        stb     r0,0(r7)
        or      r11,r11,r10
.accumulate
        and     r12,r12,r10
        addi    r7,r7,1
        bdnz    .next_vertex
        stb     r11,0(r8)
        stb     r12,0(r9)
.exit
        blr
        ENDC
_asm_vp_map_vertices_now
; Viewport mapping without perspective division: scale and translate
; x, y, z of every vertex (only the unclipped ones when a clip mask
; array is supplied).
;
; C reference:
;   if (clipMask) {
;     /* one or more vertices are clipped */
;     for (i=0;i<n;i++) {
;       if (clipMask[i]==0) {
;         vWin[i][0] = vClip[i][0] * sx + tx;
;         vWin[i][1] = vClip[i][1] * sy + ty;
;         vWin[i][2] = vClip[i][2] * sz + tz;
;       }
;     }
;   }
;   else {
;     /* no vertices are clipped */
;     for (i=0;i<n;i++) {
;       vWin[i][0] = vClip[i][0] * sx + tx;
;       vWin[i][1] = vClip[i][1] * sy + ty;
;       vWin[i][2] = vClip[i][2] * sz + tz;
;     }
;   }
;
; Registers as used by the code:
;   r3 = clipMask (0 selects the unconditional loop), r4 = n (-> CTR)
;   r5 = vWin, advanced 12 bytes per vertex
;   r6 = vClip, advanced 16 bytes per vertex
;   f1/f2/f3 = sx/sy/sz, f4/f5/f6 = tx/ty/tz
;   f7-f9 = loaded x/y/z, f10-f12 = results, r0 = clip mask byte
        cmpwi   r4,0
        beq     .exit
        subi    r5,r5,4                 ; pre-bias for stfsu
        mtctr   r4
        cmpwi   r3,0
        beq     .map_all
        subi    r3,r3,1                 ; pre-bias for lbzu
.map_unclipped
        lbzu    r0,1(r3)                ; r0 = clipMask[i]
        cmpwi   r0,0
        bne     .skip_vertex
        lfs     f7,0(r6)
        fmadds  f10,f1,f7,f4            ; x*sx + tx
        lfs     f8,4(r6)
        fmadds  f11,f2,f8,f5            ; y*sy + ty
        lfs     f9,8(r6)
        fmadds  f12,f3,f9,f6            ; z*sz + tz
        stfsu   f10,4(r5)
        stfsu   f11,4(r5)
        stfsu   f12,4(r5)
        b       .advance
.skip_vertex
        addi    r5,r5,12                ; leave vWin[i] untouched
.advance
        addi    r6,r6,16
        bdnz    .map_unclipped
        blr
.map_all
        lfs     f7,0(r6)
        fmadds  f10,f1,f7,f4            ; x*sx + tx
        lfs     f8,4(r6)
        fmadds  f11,f2,f8,f5            ; y*sy + ty
        lfs     f9,8(r6)
        fmadds  f12,f3,f9,f6            ; z*sz + tz
        stfsu   f10,4(r5)
        stfsu   f11,4(r5)
        addi    r6,r6,16
        stfsu   f12,4(r5)
        bdnz    .map_all
.exit
        blr
_asm_vp_map_vertices
; Viewport mapping with perspective division.
;
; C reference:
;   if (clipMask) {
;     /* one or more vertices are clipped */
;     for (i=0;i<n;i++) {
;       if (clipMask[i] == 0) {
;         if (vClip[i][3] != 0.0F) {
;           GLfloat wInv = 1.0F / vClip[i][3];
;           vWin[i][0] = vClip[i][0] * wInv * sx + tx;
;           vWin[i][1] = vClip[i][1] * wInv * sy + ty;
;           vWin[i][2] = vClip[i][2] * wInv * sz + tz;
;         }
;         else {
;           /* Div by zero! Can't set window coords to infinity, so...*/
;           vWin[i][0] = 0.0F;
;           vWin[i][1] = 0.0F;
;           vWin[i][2] = 0.0F;
;         }
;       }
;     }
;   }
;   else {
;     /* no vertices are clipped */
;     (same body for every vertex, without the clipMask test)
;   }
;
; Registers as used by the code:
;   r3 = clipMask (0 selects the unconditional loop), r4 = n
;   r5 = vWin, advanced 12 bytes per vertex
;   r6 = vClip, advanced 16 bytes per vertex
;   f1/f2/f3 = sx/sy/sz, f4/f5/f6 = tx/ty/tz
;   f12 = 0.0 and f11 = 2.0, loaded r2-relative from fp_0 / fp_2
;   f0, f7-f10, f13, f26-f31 = scratch
; f26-f31 are stored below r1 on entry and reloaded before returning.
;
; Reciprocals:
;   * The single-vertex paths use the fres estimate of 1/w directly.
;     NOTE(review): no refinement step follows, so the result is only
;     as precise as the CPU's estimate - confirm this is acceptable.
;   * The four-at-a-time path computes frsqrte(w*w), i.e. an estimate
;     of 1/|w|, and refines it with three steps of x = x * (2 - w*x).
;     NOTE(review): that iteration only converges towards 1/w for
;     w > 0; presumably guaranteed because this path is taken only
;     when no vertex is clipped - confirm against the caller.
;
; Fix: when one of the four w*w values was zero the old code skipped
; the estimate, carried 0 through the refinement and wrote (tx,ty,tz),
; whereas the C reference and the single-vertex path write (0,0,0).
; Such a batch is now handed to the single-vertex path, which handles
; w == 0 explicitly.
        stfd    f31,-8(r1)
        stfd    f30,-16(r1)
        stfd    f29,-24(r1)
        stfd    f28,-32(r1)
        stfd    f27,-40(r1)
        stfd    f26,-48(r1)
        cmpwi   r4,0
        beq     .done
        subi    r5,r5,4                 ; pre-bias for stfsu/stwu
        cmpwi   r3,0                    ; clip mask array supplied?
        lfs     f12,fp_0(r2)            ; f12 = 0.0 (cr0 is left untouched)
        lfs     f11,fp_2(r2)            ; f11 = 2.0
        beq     .plain_loop
; --- clip mask supplied: map only vertices whose mask byte is 0 ---
.masked_loop
        lbz     r0,0(r3)                ; r0 = clipMask[i]
        cmpwi   r0,0
        bne     .masked_skip
        lfs     f13,12(r6)              ; f13 = w
        fcmpu   f13,f12
        beq     .masked_zero
        fres    f13,f13                 ; f13 ~ 1/w
        lfs     f7,0(r6)
        fmuls   f7,f7,f13
        lfs     f8,4(r6)
        fmadds  f7,f1,f7,f4             ; x*wInv*sx + tx
        fmuls   f8,f8,f13
        lfs     f9,8(r6)
        fmadds  f8,f2,f8,f5             ; y*wInv*sy + ty
        stfsu   f7,4(r5)
        fmuls   f9,f9,f13
        stfsu   f8,4(r5)
        fmadds  f9,f3,f9,f6             ; z*wInv*sz + tz
        stfsu   f9,4(r5)
        b       .masked_next
.masked_zero
        li      r0,0                    ; w == 0: store three zero words
        stwu    r0,4(r5)
        stwu    r0,4(r5)
        stwu    r0,4(r5)
        b       .masked_next
.masked_skip
        addi    r5,r5,12                ; clipped: leave vWin[i] untouched
.masked_next
        addi    r6,r6,16
        addi    r3,r3,1
        subic.  r4,r4,1
        bne     .masked_loop
        b       .done
; --- no clip mask: map every vertex, four at a time where possible ---
.plain_loop
        cmplwi  r4,4
        blt     .plain_single
        lfs     f8,12(r6)               ; f8  = w of vertex 0
        lfs     f9,12+16(r6)            ; f9  = w of vertex 1
        lfs     f10,12+2*16(r6)         ; f10 = w of vertex 2
        lfs     f13,12+3*16(r6)         ; f13 = w of vertex 3
        fmuls   f0,f8,f8                ; squares for the rsqrt estimate
        fmuls   f31,f9,f9
        fmuls   f29,f10,f10
        fmuls   f27,f13,f13
        fcmpu   f0,f12                  ; any zero square: let the
        beq     .plain_single           ; single-vertex path deal with
        fcmpu   f31,f12                 ; the vertex at r6 (r6/r5/r4
        beq     .plain_single           ; have not been changed yet)
        fcmpu   f29,f12
        beq     .plain_single
        fcmpu   f27,f12
        beq     .plain_single
        frsqrte f0,f0                   ; ~ 1/|w| for each vertex
        frsqrte f31,f31
        frsqrte f29,f29
        frsqrte f27,f27
        fnmsubs f7,f0,f8,f11            ; refinement step 1: 2 - w*x
        fnmsubs f30,f31,f9,f11
        fnmsubs f28,f29,f10,f11
        fnmsubs f26,f27,f13,f11
        fmuls   f0,f0,f7                ; x = x * (2 - w*x)
        fmuls   f31,f31,f30
        fmuls   f29,f29,f28
        fmuls   f27,f27,f26
        fnmsubs f7,f0,f8,f11            ; refinement step 2
        fnmsubs f30,f31,f9,f11
        fnmsubs f28,f29,f10,f11
        fnmsubs f26,f27,f13,f11
        fmuls   f0,f0,f7
        fmuls   f31,f31,f30
        fmuls   f29,f29,f28
        fmuls   f27,f27,f26
        fnmsubs f7,f0,f8,f11            ; refinement step 3
        fnmsubs f30,f31,f9,f11
        fnmsubs f28,f29,f10,f11
        fnmsubs f26,f27,f13,f11
        fmuls   f0,f0,f7
        fmuls   f31,f31,f30
        fmuls   f29,f29,f28
        fmuls   f27,f27,f26
        lfs     f7,0(r6)                ; vertex 0, wInv in f0
        fmuls   f7,f7,f0
        lfs     f8,4(r6)
        fmadds  f7,f1,f7,f4
        fmuls   f8,f8,f0
        lfs     f9,8(r6)
        fmadds  f8,f2,f8,f5
        stfsu   f7,4(r5)
        fmuls   f9,f9,f0
        stfsu   f8,4(r5)
        fmadds  f9,f3,f9,f6
        addi    r6,r6,16
        stfsu   f9,4(r5)
        lfs     f7,0(r6)                ; vertex 1, wInv in f31
        fmuls   f7,f7,f31
        lfs     f8,4(r6)
        fmadds  f7,f1,f7,f4
        fmuls   f8,f8,f31
        lfs     f9,8(r6)
        fmadds  f8,f2,f8,f5
        stfsu   f7,4(r5)
        fmuls   f9,f9,f31
        stfsu   f8,4(r5)
        fmadds  f9,f3,f9,f6
        addi    r6,r6,16
        stfsu   f9,4(r5)
        lfs     f7,0(r6)                ; vertex 2, wInv in f29
        fmuls   f7,f7,f29
        lfs     f8,4(r6)
        fmadds  f7,f1,f7,f4
        fmuls   f8,f8,f29
        lfs     f9,8(r6)
        fmadds  f8,f2,f8,f5
        stfsu   f7,4(r5)
        fmuls   f9,f9,f29
        stfsu   f8,4(r5)
        fmadds  f9,f3,f9,f6
        addi    r6,r6,16
        stfsu   f9,4(r5)
        lfs     f7,0(r6)                ; vertex 3, wInv in f27
        fmuls   f7,f7,f27
        lfs     f8,4(r6)
        fmadds  f7,f1,f7,f4
        fmuls   f8,f8,f27
        lfs     f9,8(r6)
        fmadds  f8,f2,f8,f5
        stfsu   f7,4(r5)
        fmuls   f9,f9,f27
        stfsu   f8,4(r5)
        fmadds  f9,f3,f9,f6
        addi    r6,r6,16
        stfsu   f9,4(r5)
        subic.  r4,r4,4
        bne     .plain_loop
        b       .done
.plain_single
        lfs     f13,12(r6)              ; f13 = w
        fcmpu   f13,f12
        beq     .plain_zero
        fres    f13,f13                 ; f13 ~ 1/w
        lfs     f7,0(r6)
        fmuls   f7,f7,f13
        lfs     f8,4(r6)
        fmadds  f7,f1,f7,f4
        fmuls   f8,f8,f13
        lfs     f9,8(r6)
        fmadds  f8,f2,f8,f5
        stfsu   f7,4(r5)
        fmuls   f9,f9,f13
        stfsu   f8,4(r5)
        fmadds  f9,f3,f9,f6
        addi    r6,r6,16
        stfsu   f9,4(r5)
        b       .plain_next
.plain_zero
        li      r0,0                    ; w == 0: store three zero words
        stwu    r0,4(r5)
        stwu    r0,4(r5)
        stwu    r0,4(r5)
        addi    r6,r6,16
.plain_next
        subic.  r4,r4,1
        bne     .plain_loop
.done
        lfd     f26,-48(r1)
        lfd     f27,-40(r1)
        lfd     f28,-32(r1)
        lfd     f29,-24(r1)
        lfd     f30,-16(r1)
        lfd     f31,-8(r1)
        blr
; Single-precision constants that _asm_vp_map_vertices loads through r2.
        section data
fp_0    dc.s    0                       ; 0.0, compared against w
fp_2    dc.s    2                       ; 2.0, used by the reciprocal refinement